import re
from pathlib import Path
import pandas as pd
import stanza

# 1) Universal Dependencies (UD) pipeline for Spanish (models already
#    downloaded in earlier steps). Built once at import time and shared by
#    every call to extract_ud_metrics below.
#    Processors: tokenization, POS tagging, lemmatization, dependency parsing.
#    tokenize_no_ssplit=False: per the Stanza docs this leaves the tokenizer's
#    own sentence segmentation enabled.
nlp = stanza.Pipeline(
    lang="es",
    processors="tokenize,pos,lemma,depparse",
    tokenize_no_ssplit=False
)

# 2) Split a TXT into individual responses (based on MODEL_XX headers)
def split_by_headers(text: str, model_prefix: str) -> list:
    """Split ``text`` into ``(doc_id, content)`` pairs at ``<prefix>_NN`` headers.

    A header is ``model_prefix`` followed by an underscore and two digits
    (e.g. ``GEMINI_01``). The header may appear anywhere in the text, not
    only at the start of a line. Anything before the first header is
    discarded.

    Args:
        text: Full contents of one corpus file.
        model_prefix: Header prefix, matched literally (regex metacharacters
            in it are escaped).

    Returns:
        A list of ``(doc_id, content)`` tuples in order of appearance, where
        ``content`` is the whitespace-stripped text between one header and
        the next (or the end of the text). Empty if no header is found.
    """
    # Escape the prefix so that e.g. "A.B" cannot match "AxB_01".
    header = rf"({re.escape(model_prefix)}_\d{{2}})\s*"
    # With one capture group, re.split yields:
    #   [preamble, id_1, body_1, id_2, body_2, ...]
    parts = re.split(header, text)
    return [
        (doc_id, body.strip())
        for doc_id, body in zip(parts[1::2], parts[2::2])
    ]

# 3) Extract UD metrics for one text
def extract_ud_metrics(doc_id: str, model: str, text: str):
    """Parse ``text`` with the module-level ``nlp`` pipeline and count relations.

    Args:
        doc_id: Identifier copied into the result row.
        model: Model label copied into the result row.
        text: Text to parse.

    Returns:
        A dict with, in this order: ``doc_id``, ``model``, ``tokens`` (sum of
        ``len(sentence.words)``), ``sentences``, ``tokens_per_sentence``, the
        raw count of each tracked dependency relation, and the same counts
        normalised per 1000 words (``<name>_per1000``). Ratios are ``0.0``
        when their denominator is zero.
    """
    parsed = nlp(text)
    sentences = parsed.sentences

    sentence_total = len(sentences)
    token_total = sum(len(sentence.words) for sentence in sentences)

    # UD relation label -> output column name
    tracked = {
        "ccomp": "ccomp",          # finite clausal complement (approx. nominal clause)
        "xcomp": "xcomp",          # open/non-finite complement (approx. non-finite nominal)
        "advcl": "advcl",          # adverbial clause
        "acl:relcl": "acl_relcl",  # relative clause
        "cc": "cc",                # coordinating conjunction
        "conj": "conj",            # conjunct (coordinated member)
    }
    counts = dict.fromkeys(tracked.values(), 0)

    for sentence in sentences:
        for word in sentence.words:
            column = tracked.get(word.deprel)
            if column is not None:
                counts[column] += 1

    # Frequency per 1000 words; guarded against an empty parse.
    rates = {}
    for column, count in counts.items():
        if token_total:
            rates[column + "_per1000"] = count / token_total * 1000
        else:
            rates[column + "_per1000"] = 0.0

    if sentence_total:
        mean_sentence_length = token_total / sentence_total
    else:
        mean_sentence_length = 0.0

    # Insertion order here determines the column order of the output table.
    row = {
        "doc_id": doc_id,
        "model": model,
        "tokens": token_total,
        "sentences": sentence_total,
        "tokens_per_sentence": mean_sentence_length,
    }
    row.update(counts)
    row.update(rates)
    return row

def main():
    """Compute UD metrics for every response in the corpus and report them.

    Reads one text file per model from ``corpus_txt/``, splits each into
    individual responses, computes per-response metrics, writes them all to
    ``ud_metrics_by_text.csv`` and prints the per-model mean and standard
    deviation.

    Raises:
        ValueError: If a file contains no ``<MODEL>_XX`` header.
    """
    # Folder holding the txt files
    corpus_dir = Path("corpus_txt")
    sources = {
        "CLAUDE": corpus_dir / "CLAUDE_01.txt",
        "GEMINI": corpus_dir / "GEMINI_01.txt",
        "CHATGPT": corpus_dir / "CHATGPT_01.txt",
    }

    records = []
    for model, path in sources.items():
        raw_text = path.read_text(encoding="utf-8")
        documents = split_by_headers(raw_text, model)
        if not documents:
            raise ValueError(f"No se detectaron encabezados {model}_XX en {path.name}")
        records.extend(
            extract_ud_metrics(doc_id, model, body) for doc_id, body in documents
        )

    df = pd.DataFrame(records)

    # Save the full per-text table
    df.to_csv("ud_metrics_by_text.csv", index=False, encoding="utf-8")

    # Per-model summary (mean and standard deviation)
    summary_columns = [
        "tokens",
        "sentences",
        "tokens_per_sentence",
        "ccomp_per1000",
        "xcomp_per1000",
        "advcl_per1000",
        "acl_relcl_per1000",
        "cc_per1000",
        "conj_per1000",
    ]
    grouped = df.groupby("model")[summary_columns]
    summary = grouped.agg(["mean", "std"])

    print("\n=== RESUMEN POR MODELO (MEDIA y SD) ===\n")
    print(summary)
    print("\nArchivo generado: ud_metrics_by_text.csv")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()